library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
The code chunk below reads in the final project data.
# Read the raw project data; readr infers the column types (the console
# output below confirms 10 double columns plus the character column m).
df <- readr::read_csv("fall2022_finalproject.csv", col_names = TRUE)
## Rows: 1252 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): m
## dbl (10): x1, x2, x3, x4, v1, v2, v3, v4, v5, output
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
A glimpse is shown below that reveals the number of rows and also shows some of the representative values for the columns.
# Quick structural overview: row/column counts and sample values per column.
glimpse(df)
## Rows: 1,252
## Columns: 11
## $ x1 <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0.031073, 0.0…
## $ x2 <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0.027119, 0.0…
## $ x3 <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0.260915, 0.0…
## $ x4 <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0.055192, 0.0…
## $ v1 <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9.977407, 0.2…
## $ v2 <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0.532436, 1.0…
## $ v3 <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1.298797, 1.1…
## $ v4 <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0.857509, 0.6…
## $ v5 <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0.958574, 0.2…
## $ m <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "A"…
## $ output <dbl> 0.786, 0.730, 0.996, 0.326, 0.735, 0.954, 0.969, 0.986, 0.874, …
We will start by first looking at each individual feature in relation to the output. To do so, we will use a number of dot plots to initially examine the data. Each plot also includes a line to help better view the data. For each variable, we will also consider its relationship with the categorical input m by using a facet wrap. Since we want to know whether the event occurred or not as well, we will also add a new value representing the event or non-event and color the points using that value.
# Label each row by whether the event occurred: output below 0.33 is an
# "Event", everything at or above 0.33 is a "Non-Event".
df <- df %>%
  mutate(binary = if_else(output < 0.33, "Event", "Non-Event"))
# Scatter plots of the output against each continuous input, faceted by the
# categorical machine variable m. Points are colored by the derived
# event/non-event label and a loess smoother (geom_smooth's default for this
# sample size; it prints a `method = 'loess'` message) is overlaid as a
# visual guide.
#
# The nine original chunks differed only in the plotted column, so a single
# helper removes the duplication. The `.data` pronoun selects a column from
# its string name; `labs(x = ...)` keeps the original axis label.
plot_input_vs_output <- function(data, var_name) {
  data %>%
    ggplot(mapping = aes(y = output, x = .data[[var_name]])) +
    geom_point(mapping = aes(color = binary)) +
    geom_smooth() +
    facet_wrap(~m) +
    labs(x = var_name) +
    ggtitle(paste("Variable", var_name))
}

# Plots inside a loop are not auto-printed, so print() explicitly.
for (input_var in c("x1", "x2", "x3", "x4", "v1", "v2", "v3", "v4", "v5")) {
  print(plot_input_vs_output(df, input_var))
}
Since m is a categorical variable, we will use a box plot to visualize its relationship with the output.
# m is categorical, so a box plot summarizes the output per machine.
df %>%
  ggplot(mapping = aes(x = m, y = output)) +
  geom_boxplot(color = "Red") +
  ggtitle("Variable M")
Now we will examine the distributions of the variables in the data set using histograms. As in the previous section, we will also examine the potential impacts of the categorical variable m by using a facet wrap.
# Histograms (20 bins) of each continuous input, faceted by the categorical
# machine variable m to show how each distribution shifts across machines.
#
# The nine original chunks differed only in the plotted column; a helper and
# a loop remove the duplication. The `.data` pronoun selects the column by
# its string name; `labs(x = ...)` keeps the original axis label.
plot_input_histogram <- function(data, var_name) {
  data %>%
    ggplot(mapping = aes(x = .data[[var_name]])) +
    geom_histogram(bins = 20) +
    facet_wrap(~m) +
    labs(x = var_name) +
    ggtitle(paste("Distribution of", var_name))
}

# Plots inside a loop are not auto-printed, so print() explicitly.
for (input_var in c("x1", "x2", "x3", "x4", "v1", "v2", "v3", "v4", "v5")) {
  print(plot_input_histogram(df, input_var))
}
We will also examine the output in the same way.
# Distribution of the response itself, again faceted by machine.
df %>%
  ggplot(mapping = aes(x = output)) +
  geom_histogram(bins = 20) +
  facet_wrap(~m) +
  ggtitle("Distribution of output")
And we will view the distribution of the categorical variable m itself using a bar plot.
# Count of rows per machine: a bar chart is the natural view for a
# categorical variable's distribution.
df %>%
  ggplot(mapping = aes(x = m)) +
  geom_bar() +
  ggtitle("Distribution of M")
Looking at these initial graphs, we can see a number of potentially interesting features to keep an eye on. First of all, the categorical variable m clearly has a small but noticeable impact on all other variables. One of these that stands out initially is that when m has value B, there is a noticeable change to the distribution of the output compared to other values. We can also see that the different variables have a number of different distributions. Looking at the histograms, we see that none of them look to be particularly normally distributed. v4 is the closest to a normal distribution, and some of the variables clearly lean one way or another or have large spikes at certain values.
Now we will look at the specific derived features that subject matter experts from the company listed as important. To do so, we will first create a new data frame with those features. We will also create a column for our logit-transformed output.
# Build the expert-suggested derived features plus the logit-transformed
# response. dplyr evaluates the columns of a single mutate() sequentially,
# so z can reference the freshly created x5.
df_derived <- df %>%
  mutate(
    x5 = 1 - (x1 + x2 + x3 + x4),   # remaining fraction of the x inputs
    w = x2 / (x3 + x4),
    z = (x1 + x2) / (x4 + x5),
    t = v1 * v2,
    transformed_output = boot::logit(output)
  )
A glimpse is shown below that reveals the number of rows and also shows some of the representative values for the columns.
# Confirm the new columns were added and inspect representative values.
glimpse(df_derived)
## Rows: 1,252
## Columns: 17
## $ x1 <dbl> 0.025878, 0.030768, 0.019325, 0.306212, 0.031296, 0…
## $ x2 <dbl> 0.255934, 0.261575, 0.020877, 0.033379, 0.259342, 0…
## $ x3 <dbl> 0.492830, 0.498460, 0.258360, 0.255385, 0.264387, 0…
## $ x4 <dbl> 0.012770, 0.055779, 0.012424, 0.056190, 0.056594, 0…
## $ v1 <dbl> 0.275651, 0.343204, 4.998508, 5.090153, 5.031107, 9…
## $ v2 <dbl> 0.033657, 0.027082, 0.030259, 0.052342, 0.517705, 0…
## $ v3 <dbl> 1.166214, 1.260579, 1.298285, 1.322005, 1.368195, 1…
## $ v4 <dbl> 0.408402, 0.664248, 0.412870, 0.652111, 0.533701, 0…
## $ v5 <dbl> 0.525226, 2.866343, 0.409007, 0.861594, 6.451933, 0…
## $ m <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A", "A", "…
## $ output <dbl> 0.786, 0.730, 0.996, 0.326, 0.735, 0.954, 0.969, 0.…
## $ binary <chr> "Non-Event", "Non-Event", "Non-Event", "Event", "No…
## $ x5 <dbl> 0.212588, 0.153418, 0.689014, 0.348834, 0.388381, 0…
## $ w <dbl> 0.50619858, 0.47195344, 0.07709835, 0.10712990, 0.8…
## $ z <dbl> 1.25050808, 1.39745312, 0.05731369, 0.83844661, 0.6…
## $ t <dbl> 0.009277586, 0.009294651, 0.151249854, 0.266428788,…
## $ transformed_output <dbl> 1.3009808, 0.9946226, 5.5174529, -0.7263327, 1.0201…
Now that we have our features, we will view their distributions using histograms. As before, we will examine the potential impact of m (what machine is used during production) using a facet wrap.
# Histograms (20 bins) of the derived features and the logit-transformed
# response, faceted by machine m. The original five chunks differed only in
# the plotted column and the title, so the titles are stored in a named
# vector (note the irregular wording for transformed_output) and a loop does
# the plotting. `.data` selects the column by name; `labs(x = ...)` keeps
# the original axis label.
derived_titles <- c(
  x5 = "Distribution of x5",
  w = "Distribution of w",
  z = "Distribution of z",
  t = "Distribution of t",
  transformed_output = "Distribution of the transformed output"
)

# Plots inside a loop are not auto-printed, so print() explicitly.
for (derived_var in names(derived_titles)) {
  p <- df_derived %>%
    ggplot(mapping = aes(x = .data[[derived_var]])) +
    geom_histogram(bins = 20) +
    facet_wrap(~m) +
    labs(x = derived_var) +
    ggtitle(derived_titles[[derived_var]])
  print(p)
}
We will also use dot plots to view the relationships between the derived features and the output as we did above for the initial features.
# Scatter plots of the output against each derived feature, faceted by
# machine m, colored by the event/non-event label, with a loess smoother
# (geom_smooth prints its `method = 'loess'` message) as a visual guide.
# The four original chunks differed only in the plotted column, so a helper
# plus a loop removes the duplication.
plot_derived_vs_output <- function(data, var_name) {
  data %>%
    ggplot(mapping = aes(y = output, x = .data[[var_name]])) +
    geom_point(mapping = aes(color = binary)) +
    geom_smooth() +
    facet_wrap(~m) +
    labs(x = var_name) +
    ggtitle(paste("Derived variable", var_name))
}

# Plots inside a loop are not auto-printed, so print() explicitly.
for (derived_var in c("x5", "w", "z", "t")) {
  print(plot_derived_vs_output(df_derived, derived_var))
}
We also want to see if any of the features are related to one another. For this we can use a corrplot, removing the categorical variable m, the derived binary label, and the output columns so that only the continuous features are compared.
# Keep only the continuous features (dropping m, the binary label, and the
# output columns), then visualize their pairwise correlations.
df_for_relationships <- df_derived %>%
  select(x1, x2, x3, x4, x5, v1, v2, v3, v4, v5, w, z, t)

df_for_relationships %>%
  cor() %>%
  corrplot::corrplot(type = "upper", method = "square")
From this we can see that some of the features are correlated with others. In some cases, this is not very useful as we know some features are derived from others. For instance, the strong relationships between t and v1 and v2 are unsurprising since t = v1*v2. In other cases, this is good to know for the future. For instance, we can see that x3 and x4 have a relatively strong positive relationship, while x1 and x3 have a similar negative relationship.
Now we have a much better idea of what the data looks like. We have visualized the distributions of all the inputs, derived features, the output, and the transformed output using histograms. We have grouped this data by the categorical input m and found some differences in the results based on what machine was used. We have visualized the relationships between the output and inputs using dot plots and simple lines of best fit. We have visualized the relationship of the derived binary output with each input using the color on the dot plots. Finally, we have visualized the relationship between each feature to see if they are correlated using a corrplot.